* Steps: 
** 1) GENERATE SUITABILITY INDEX
** 2) CREATE IMR AND U5MR
** 3) IDENTIFY START AND COMPLETION YEARS
** 4) CREATE ADDITIONAL VARIABLES FOR APPENDIX
** 5) LABEL VARIABLES 
** 6) CREATE VARIABLES FOR EVENT STUDY
** 7) SET SAMPLE 

*------------------------------------------------------------------------------
*1) GENERATE SUITABILITY INDEX
*------------------------------------------------------------------------------	

* Load the district-year panel. The index is estimated on the 2005 cross-section
* inside preserve/restore and then merged back onto every year of the panel.
use "$data/analysis_district", clear
sort ubigeo year

* Terrain, elevation, river-length and area covariates entering the PCA
global cost "geo_slope1_perc geo_slope2_perc geo_slope3_perc geo_slope4_perc geo_elev_250perc geo_elev_251500perc geo_elev_5011000perc geo_elev_1001perc river_leng geo_area"

preserve

	* One observation per district: 2005 values only
	keep if year==2005

	* Correlation matrix of the inputs (printed to the log only)
	corr $cost

	* Principal component analysis keeping components with eigenvalue >= 1;
	* predict with a single new variable name stores the score on the first component
	capture drop pscore
	pca $cost , level(95)  mineigen(1) 
	predict pscore

	* Min-max normalisation of the score to the [0,1] interval
	quietly summarize pscore
	replace pscore = (pscore - `r(min)') / (`r(max)'-`r(min)')

	* Keep only districts for which a score could be computed
	drop if pscore==.
	keep ubigeo pscore

	* Inverse index: 1 minus the normalised score
	generate pscore2 = 1 - pscore

	label variable pscore "Suitability index" 
	label variable pscore2 "Low suitability index"

	* District-level lookup file used by the merge below
	sort ubigeo
	tempfile geo
	save `geo'

restore

* Reload the full panel and attach the district-level indices.
* keep(1 3): every panel row is kept, whether or not it has an index.
use "$data/analysis_district", clear
sort ubigeo year		

merge m:1 ubigeo using `geo',  nogen keep(1 3)


*------------------------------------------------------------------------------
*2) CREATE IMR AND U5MR
*------------------------------------------------------------------------------		

*Infant mortality rate: deaths below age 1 over one fifth of the under-five
*population (pop_u5y/5 stands in for the number of infants), scaled by $mr_per
gen vs_imr1y= (mort_less1y/(pop_u5y/5))*$mr_per

*Under-5 mortality rate: deaths below age 5 over the under-five population
gen vs_u5mr= (mort_u5y/pop_u5y)*$mr_per

label var vs_imr1y "IMR (per $mr_per infants)"
label var vs_u5mr "U5MR (per $mr_per under-fives)"

*Cause-of-death codes; one IMR and one U5MR variable is built per code.
*The same list is reused for the alternative-denominator IMR below.
local cod "infec peri dige endoc san other neop nerv circ resp skin musc geniuri malf notclsf accid"
foreach c of local cod {
	gen vs_imr1y_`c'=(mort_1y_`c'/(pop_u5y/5))*$mr_per
	gen vs_u5mr_`c'=(mort_u5y_`c'/pop_u5y)*$mr_per

	lab var vs_imr1y_`c' "IMR (per $mr_per infants) - `c'"
	* The U5MR denominator is under-fives, so the label now matches vs_u5mr
	lab var vs_u5mr_`c' "U5MR (per $mr_per under-fives) - `c'"
}

*Infant population implied by the "divide by 5" rule used above
gen pop_u5y_1y=pop_u5y/5 if pop_u5y!=.
label var pop_u5y_1y "Infant population (computed)"

*Alternative denominator IMR: weighting by share of infants/u5 in 2005 census in each district instead of dividing by /5
gen pop_u1y=pop_u1y5y05*pop_u5y
gen vs_imr1yA= (mort_less1y/pop_u1y)*$mr_per

label var vs_imr1yA "Alternative IMR (per $mr_per infants)"
lab var pop_u5y "Population underfive (forecasted)"
* pop_u1y is the infant population, so it is no longer labelled "underfive"
lab var pop_u1y "Population under one (u5 pop. x share of infants in 2005 census)"

*Cause-specific IMR with the alternative denominator (kept as a separate loop so
*the variable order in the dataset is unchanged)
foreach c of local cod {
	gen vs_imr1yA_`c'=(mort_1y_`c'/pop_u1y)*$mr_per
	lab var vs_imr1yA_`c' "Alternative IMR (per $mr_per infants) - `c'"
}


*------------------------------------------------------------------------------
*3) IDENTIFY START AND COMPLETION YEARS
*------------------------------------------------------------------------------	

*Year implementation started (first year first project started)
* A start year is a year with non-zero s0 that follows a year with s0==0.
* (year) inside bysort forces chronological order within district, so s0[_n-1]
* is guaranteed to be the previous observation in time.
* !missing(s0) is required because a missing value also satisfies s0!=0.
bysort ubigeo (year): gen p_y0_min=year if s0!=0 & !missing(s0) & s0[_n-1]==0
* Districts already under implementation in 2005 have no earlier zero year in the panel
replace p_y0_min=year if p_y0_min==. & s0!=0 & !missing(s0) & year==2005
* Spread the start year to every row of the district.
* NOTE(review): max() returns the latest 0 -> non-zero switch if s0 ever returns
* to zero; min() would match the "first year" wording - confirm s0 never resets.
bysort ubigeo: egen s0_year=max(p_y0_min)
drop p_y0_min

* Indicator = 1 from the start year onwards; missing for districts with no start year
cap drop s0_d
gen s0_d=s0_year<=year if s0_year!=.
lab var s0_d "Flag - Year implementation started in district"

*Year implementation finished (year all projects are completed)
* First year in which s2 equals s0_max with both counts non-zero.
* Missing s0/s2 are excluded: "." equals "." and "." differs from 0, so rows with
* no project data would otherwise count as completed.
cap drop s2_year*
gen s2_yearx=year if s0_max==s2 & s0!=0 & s2!=0 & !missing(s0, s2)
bysort ubigeo: egen s2_year=min(s2_yearx)
drop s2_yearx

lab var s0_year "Year implementation was started in district"
lab var s2_year "Year implementation was completed in district"

*------------------------------------------------------------------------------
*4) ADDITIONAL VARIABLES FOR APPENDIX
*------------------------------------------------------------------------------	

* Variable groups used in this section
global mr "vs_imr1y vs_u5mr"
global censo "inei_pop geo_pop_dens05 dis_piped dis_hh_sewer dis_hh_onsite dis_hh_od dis_educ_secp dis_elecp"
global muni "renamu_pia renamu_int renamu_atform renamu_dhealth" 
global siaf "siaf_ejc_trans siaf_ejc_energ siaf_ejc_salud "

*Transform monetary values into real terms in USD 2010
//NOMINAL EXCHANGE RATE SOLES-USD FOR YEARS
//USA CPI  2010 and Exchange rate (World Bank Databank)
* Year-by-year lookup: position k in each list belongs to the same year.
local defl_years 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
local defl_fx    3.296 3.274 3.128 2.924 3.012 2.825 2.754 2.638 2.702 2.839 3.184
local defl_cpi   89.561 92.45 95.087 98.737 98.386 100 103.157 105.292 106.834 108.567 108.696

foreach var of varlist renamu_pia $siaf  {

	* value / exchange rate gives nominal USD; / CPI * 100 rebases to 2010 prices.
	* No capture here: an error on any year should stop the script rather than
	* leave one year silently unconverted.
	forvalues k = 1/11 {
		local yr  : word `k' of `defl_years'
		local fx  : word `k' of `defl_fx'
		local cpi : word `k' of `defl_cpi'
		replace `var'=((`var'/`fx')/`cpi')*100 if year==`yr'
	}

	* Express in millions
	replace `var'=`var'/1000000
}


*Impute missings from province mean, keep original ('var'0), and flag missings
* Step 1: keep an untouched copy of every variable before any imputation
foreach var of varlist  $censo $muni {
	clonevar `var'0=`var'
}

* Step 2: flag, then impute.
* The flag must be built BEFORE imputing. Rebuilding it inside a loop over
* provinces (as was done previously) recomputes it after earlier provinces were
* already filled in, so it ends up flagging only what is still missing at the
* last iteration.
foreach var of varlist  $censo $muni {

	cap drop `var'_miss
	gen byte `var'_miss=(`var'==.)

	local lbl : variable label `var'
	lab var `var'_miss "`lbl' (flag missings)"

	* Province mean of the observed values, computed once per variable.
	* Using the egen result avoids expanding `r(mean)', which is presumably empty
	* (and makes the replace invalid) when a province has no observed value.
	tempvar provmean
	bysort prov_id: egen double `provmean'=mean(`var')

	* Same province range as before (ids 1 to 195); assumes prov_id is integer-valued
	replace `var'=`provmean' if `var'==. & inrange(prov_id, 1, 195)
	drop `provmean'
}

* bysort prov_id reordered the data; restore the panel order
sort ubigeo year

*Replace with missings years with missing data
* Years up to 2006 and year 2015 are set to missing. The condition uses "|":
* with "&" it could never be true (no year is both <=2006 and ==2015), so
* nothing was blanked.
sort ubigeo year
foreach var of varlist $siaf  {
	replace `var'=. if year<=2006 | year==2015 
}

*Transform SIAF expenditure data with inverse hyperbolic sine
* asinh(x) = log(x + sqrt(x^2 + 1)); defined at zero, unlike log(x)
foreach var of varlist $siaf  { 
	gen `var'_ln=log(`var' + sqrt(`var'^2 + 1))
}

*MR change during study period
sort ubigeo year
* Distribution of start years across districts (log output only)
sum s0_year if year==2015, det

* Declare the panel so that the F10. lead operator can be used
tsset ubigeo_id year
foreach var of varlist vs_imr1y vs_u5mr {
	* Value in year t minus the value ten years later (non-missing only where
	* year t+10 is in the panel).
	* NOTE(review): this is start minus end, i.e. the negative of the usual
	* "change" - confirm the intended sign.
	gen `var'_chg=`var'-F10.`var'	

	local lblchg : variable label `var'
	lab var `var'_chg "`lblchg' (change in study period)"
}

*Pre-implementation annual change
foreach var of varlist $mr $muni inei_pop geo_pop_dens turnover {

	* One-year-ahead change, computed within district in year order. Without the
	* by-group, the last year of a district would be differenced against the first
	* row of the next district, and the bysort below would leave the within-district
	* order undefined for later variables.
	bysort ubigeo (year): gen `var'_prechg=`var'[_n+1] - `var'
	* Keep changes up to the start year only; districts with missing s0_year keep
	* all years because "year > ." is false
	replace `var'_prechg=. if year>s0_year
	* District mean of the retained annual changes
	bysort ubigeo: egen `var'_prechgm=mean(`var'_prechg)

	local lblprechg : variable label `var'
	lab var `var'_prechg "`lblprechg' (pre-implementation annual change)"
	lab var `var'_prechgm "`lblprechg' (Mean pre-implementation annual change)"
}


*Natural regions
* Department codes are grouped into three regions; any code not listed stays 0.
* NOTE(review): dep_id 7 is in none of the lists - confirm whether it is absent
* from the data or should belong to one of the regions.
gen geo_region=0
replace geo_region=1 if inlist(dep_id, 4, 11, 13, 14, 15, 18, 20, 23, 24)
replace geo_region=2 if inlist(dep_id, 2, 3, 5, 6, 8, 9, 10, 12, 19, 21)
replace geo_region=3 if inlist(dep_id, 1, 16, 17, 22, 25)
tab geo_region

* Build the dummies explicitly. tab, gen() numbers them by the rank of the
* distinct values, so a single unassigned district (geo_region==0) would shift
* every dummy by one and the labels below would name the wrong region.
forvalues r = 1/3 {
	gen byte geo_region`r'=(geo_region==`r')
}

lab var geo_region "Natural regions (categories)"
lab var geo_region1 "Coast"
lab var geo_region2 "Highlands - Andes"
lab var geo_region3 "Rainforest - Amazon"

*Geographic specific trends
* Remove interactions left over from a previous run, if any
cap drop pscore*y *percy* *areay* 
cap drop pscore2y

* Time-invariant characteristics to interact with calendar year
local trend_vars geo_slope1_perc geo_slope2_perc geo_slope3_perc geo_slope4_perc geo_elev_250perc geo_elev_251500perc geo_elev_5011000perc geo_elev_1001perc river_leng geo_area pscore2 geo_region1 geo_region2 geo_region3 geo_pop_dens05

foreach base of varlist `trend_vars' {
	* New variable <name>y = characteristic x year, labelled after its source
	local base_lbl : variable label `base'
	generate `base'y = `base' * year
	label variable `base'y "`base_lbl' x Year"
}

*Extra variables for heterogeneity - Get values from 2005 census
* For each variable, copy its 2005 value to every year of the district as <name>05
local het_vars dis_elecp dis_educ_secomp dis_educ_secp renamu_pia renamu_int renamu_atform renamu_dhealth pct_reelection_1993

foreach src of varlist `het_vars' {
	* Only the 2005 row contributes; other years enter as missing and are
	* ignored by max()
	bysort ubigeo: egen `src'05 = max(cond(year==2005, `src', .))

	local src_lbl : variable label `src'
	label variable `src'05 "`src_lbl' (2005 census)"
}


*------------------------------------------------------------------------------
*5) LABEL VARIABLES 
*------------------------------------------------------------------------------	

* Identifiers
lab var ubigeo "District ID"
lab var year "Calendar year"

* Death counts by single year of age (mort_1y ... mort_5y)
forvalues y=1/5 {
    lab var mort_`y'y "Deaths at `y'y"
}

* Household characteristics
lab var dis_piped "Share HH piped water"
lab var dis_educ_secp "Share HH head has some secondary"
lab var dis_educ_secomp "Share HH head completed secondary"
lab var dis_elecp 		"Share HH electricity"

* Expenditure, converted to millions earlier in the script
lab var siaf_ejc_trans "Transport expenditure (millions)"
lab var siaf_ejc_energ "Energy expenditure (millions)"
lab var siaf_ejc_salud "Health expenditure (millions)"

* NOTE(review): the *_ln variables are inverse hyperbolic sine transforms, not
* plain logs; labels left unchanged in case output tables rely on them
lab var siaf_ejc_trans_ln "Transport expenditure (log)"
lab var siaf_ejc_energ_ln "Energy expenditure (log)"
lab var siaf_ejc_salud_ln "Health expenditure (log)"

* Mortality counts and population denominators
lab var mort_less1y "Deaths below 1y"
lab var mort_u5y "Deaths below 5y"
lab var pop_u5y "Population below 5y"
lab var pop_total "Population (census years)"
* inei_pop was labelled twice; only this final label is kept, with the
* "Populatiom" typo corrected
lab var inei_pop "Population (forecasts)"


*------------------------------------------------------------------------------
*6) CREATE VARIABLES FOR EVENT STUDY
*------------------------------------------------------------------------------	

* Clear event-study variables from an earlier run, if present
cap drop i t Ei K D lastcohort *event*

* Short aliases for the panel dimensions, and panel declaration
generate i = ubigeo_id	// unit id
generate t = year		// calendar period
tsset i t

label variable i "Unit id"
label variable t "Calendar period"

* Ei: first treatment year, made constant within unit by copying the first row
generate Ei = s0_year
bys i (t): replace Ei = Ei[1]

* K: periods since first treatment; missing when the unit is never treated
generate K = t-Ei
* D: treated-and-post indicator. The Ei!=. guard is needed because a missing K
* would otherwise satisfy K>=0.
generate D = K>=0 & Ei!=.

label variable Ei "Year when unit is first treated"
label variable K "Relative time (t - Ei)"
label variable D "T x Post"

//Get leads and lags
* lastcohort = 1 for units whose Ei equals the largest observed Ei.
* NOTE(review): never-treated units (Ei missing) get 0 here, although the
* original comment described this as the "latest- or never-treated" cohort.
summarize Ei
generate lastcohort = Ei==r(max)
label variable lastcohort "Last treated cohort"

* Lag dummies for K = 0, 1, ..., 10
forvalues l = 0/10 {
	generate L`l'event = K==`l'
	label variable L`l'event "Lag `l'"
}

* Lead dummies for K = -2, ..., -10
forvalues l = 2/10 {
	generate F`l'event = K==-`l'
	label variable F`l'event "Lead `l'"
}

* K = -1 is the reference period: its dummy is identically zero
generate F1event=0
label variable F1event "Lead 1"

save "$data/analysis_district_clean", replace



*------------------------------------------------------------------------------
*7) SET SAMPLE 
*------------------------------------------------------------------------------	

use "$data/analysis_district_clean", clear

*Sample
//Years 2005-2015
//AT LEAST 1 ROUND OF MORTALITY DATA
//SEWERAGE AND GEOGRAPHIC DATA
//AT LEAST 2 ROUNDS OF DATA (PANEL DATA - SINGLETON GROUPS EXCLUDED)
keep if inrange(year, 2005, 2015)

* Number of years with non-missing U5MR / IMR per district
bysort ubigeo: egen rounds=count(vs_u5mr)
bysort ubigeo: egen rounds2=count(vs_imr1y) 

*Keep districts with mortality data (at least 2 rounds) + geo data + project data 
* count() never returns missing, and two or more non-missing values imply a
* non-missing district maximum, so the explicit "max is not missing" and
* "rounds is not missing" checks are implied by the two conditions on rounds.
keep if s0!=. & geo_slope1_perc!=. & rounds>=2 & rounds2>=2
unique ubigeo

drop rounds rounds2

*Year FE
cap drop year_*
tab year, gen(year_)

* Rebuild the numeric district id on the restricted sample
cap drop ubigeo_id
egen ubigeo_id = group(ubigeo) 
label variable ubigeo_id "District ID - numeric"

xtset ubigeo_id year

save "$data/analysis_district_clean_all", replace

*Erase temp datasets
* Note: analysis_district.dta is the file this script loads at the top, so the
* script cannot be re-run until that file is rebuilt.
foreach tmp in vitalstats district_setup analysis_district {
	erase "$data/`tmp'.dta"
}


